#!/usr/bin/env bash
# Make requests to BLAST from the command line.
# @author: l3nn4rt

# Print the user manual (synopsis, commands, request statuses) to stdout.
# Arguments: none
# Outputs:   the help text on standard output
# Returns:   exit status of cat (0 unless stdout cannot be written)
print_help() {
    # Strip the directory with a parameter expansion instead of an unquoted
    # $(basename $0): this survives script paths that contain spaces or
    # start with a dash, and does not fork.
    local prog=${0##*/}

    # Unquoted here-document: only $prog is expanded, everything else in
    # the text is literal (it contains no backslashes or backticks).
    cat <<EOF
$prog - make requests to BLAST from the command line

SYNOPSIS
    $prog COMMAND [ARG]

COMMANDS
    create [FILE]   create new request reading from FILE or stdin
    delete RID      remove request and report with given RID
    help            show this help and exit
    info [RID]      show status for request RID or whole local database
    search [FILE]   search request by sequence in the local database
    sync            update local database with new reports from BLAST

DESCRIPTION
    create:  read sequence from the standard input or a file; when input
    ends, send sequence to BLAST and save a copy in the local database.
    To finalize the sequence, type Ctrl+D on a blank line.
    To abort the request creation, type Ctrl+C in any moment.

    delete:  remove from local database both request and report for the
    given request ID; this won't affect remote BLAST request.

    info:  without argument, summarize database status; if a valid request
    ID is provided, show status for the corresponding request. This command
    relies on local database only: you may want to sync first.

    search:  return IDs of all the requests matching the given sequence;
    will return exact matches only, i.e. will not return IDs of requests that
    strictly contain or are contained by the given sequence.

    sync:  download new reports for waiting requests and save in the local
    database. More about a request status below.

REQUEST STATUS
    waiting:  report not found in the local database; you may get new reports
    using command 'sync'.

    ready:  reports available in ./blaster_requests/RID; you can use
    command 'info' to get details for a specific request ID.
EOF
}

# Read a sequence, submit it to BLAST and record it in the local database.
# Globals:   __SEQUENCE__ (exported so envsubst can fill the template),
#            filename (read; optional override for the recorded origin)
# Arguments: $1 - file holding the sequence, or "-" for standard input
# Files:     reads post-data.templ and post-header.templ from the current
#            directory; writes blaster_requests/RID/{sequence.fasta,filename}
# Outputs:   progress and error messages on stdout
# Returns:   0 on success; every failure terminates the script with status 1
create_request() {
    local source_name=${1:-}
    local sequence payload reply rid

    # Create all scratch files up front so that every exit path below can
    # remove them with the same single rm; none of them may be left behind.
    sequence=$(mktemp) || exit 1
    payload=$(mktemp) || { rm -f -- "$sequence"; exit 1; }
    reply=$(mktemp) || { rm -f -- "$sequence" "$payload"; exit 1; }

    # the banners are only useful when the user types the sequence
    if [[ $source_name == - ]]; then
        echo "---- sequence begin ----"
    fi
    if ! cat -- "$source_name" >"$sequence"; then
        rm -f -- "$sequence" "$payload" "$reply"
        exit 1
    fi
    if [[ $source_name == - ]]; then
        echo "----- sequence end -----"
    fi

    # prevent empty sequence (command substitution drops trailing newlines,
    # so input made of blank lines only counts as empty too)
    __SEQUENCE__=$(<"$sequence")
    if [[ -z $__SEQUENCE__ ]]; then
        echo -e "\rError: sequence is empty. Please provide a valid sequence."
        rm -f -- "$sequence" "$payload" "$reply"
        exit 1
    fi

    # prepare request payload; stop here rather than POST an empty body
    # when the template is missing or envsubst is unavailable
    export __SEQUENCE__
    if ! envsubst <post-data.templ >"$payload"; then
        echo -e "\rError: cannot build the request payload from post-data.templ."
        rm -f -- "$sequence" "$payload" "$reply"
        exit 1
    fi

    # send request to blast; a transport failure leaves the reply without a
    # request id and is reported by the generic error below
    printf "Sending request..."
    curl 'https://blast.ncbi.nlm.nih.gov/Blast.cgi' -sSL -H @post-header.templ --data-binary "@$payload" >"$reply"

    # catch request limit error
    if grep -q "Cannot accept request, error code: 1" "$reply"; then
        echo -e "\rError: you reached the waiting requests cap. Please try again later."
        rm -f -- "$sequence" "$payload" "$reply"
        exit 1
    fi

    # extract request id: keep what sits between <b> and </b> on the
    # 'Request ID' line of the reply page
    rid=$(grep 'Request ID' "$reply" | sed 's|^.*<b>||; s|</b>.*$||')

    # catch unknown error preventing from receiving request id
    if [[ -z $rid ]]; then
        echo -e "\rError: something went wrong. Will retry."
        rm -f -- "$sequence" "$payload" "$reply"
        exit 1
    fi

    echo -e "\rSuccess! Request ID: $rid. (https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Get&RID=$rid)"

    # save request sequence for future searches
    if ! mkdir -p -- "blaster_requests/$rid" ||
        ! mv -- "$sequence" "blaster_requests/$rid/sequence.fasta"; then
        rm -f -- "$sequence" "$payload" "$reply"
        exit 1
    fi

    # Record where the sequence came from. $filename is honoured when the
    # environment provides it; otherwise fall back to the input argument
    # instead of writing a blank line.
    printf '%s\n' "${filename:-$source_name}" >"blaster_requests/$rid/filename"

    rm -f -- "$payload" "$reply"
}

# Remove a request (sequence and report) from the local database.
# Arguments: $1 - request ID, i.e. the name of an entry in blaster_requests/
# Outputs:   an explanatory message on stdout when nothing was removed
# Returns:   0 when the request was removed, 1 otherwise
delete_request() {
    local rid=${1-}

    # An empty ID would expand to "blaster_requests/" and wipe the whole
    # database; "." / ".." or anything with a slash could reach outside it.
    case "$rid" in
        '')
            echo "Error: missing request ID."
            return 1
            ;;
        . | .. | */*)
            echo "Request $rid not found."
            return 1
            ;;
    esac

    # Plain if/else: the former "rm || echo && false" parsed as
    # "(rm || echo) && false" and reported failure even after a
    # successful removal.
    if ! rm -r -- "blaster_requests/$rid" 2>/dev/null; then
        echo "Request $rid not found."
        return 1
    fi
}

# Poll BLAST for every request that has no local result yet and store the
# outcome (report.csv, or a .failed / .nomatch marker) in its directory.
# Arguments: none
# Outputs:   a progress line and a summary table on stdout; a warning on
#            stderr when a ready report could not be downloaded
# Returns:   never returns: exits 0 when every request has a final outcome,
#            1 when at least one is still pending
sync_reports() {
    # upper bound for report download attempts, so that a dead network
    # cannot keep the command spinning forever
    local -r max_attempts=5

    # Read the request IDs line by line instead of word-splitting the
    # output of find.
    local rids=() rid
    while IFS= read -r rid; do
        rids+=("$rid")
    done < <(find blaster_requests -mindepth 1 -maxdepth 1 -type d | cut -d/ -f2)

    local count=${#rids[@]}
    local i=0
    local old_matched_count=0
    local old_unmatched_count=0
    local old_failed_count=0
    local waiting_count=0
    local new_matched_rids=()
    local new_unmatched_rids=()
    local new_failed_rids=()
    local dir attempt downloaded

    # one pair of scratch files, reused (truncated) for every request
    local reply response_csv
    reply=$(mktemp) || exit 1
    response_csv=$(mktemp) || { rm -f -- "$reply"; exit 1; }

    for rid in "${rids[@]}"; do
        dir=blaster_requests/$rid
        ((i++))
        # data goes through %s/%d so a stray '%' can never act as a format
        printf '\r[%d / %d] %s ...' "$i" "$count" "$rid"

        # skip requests for which we already have reports
        if [[ -f $dir/report.csv ]]; then
            ((old_matched_count++))
            continue
        fi

        # skip requests previously marked as failed
        if [[ -f $dir/.failed ]]; then
            ((old_failed_count++))
            continue
        fi

        # skip requests previously marked as nomatch
        if [[ -f $dir/.nomatch ]]; then
            ((old_unmatched_count++))
            continue
        fi

        # get request page
        curl -sSL "https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Get&RID=$rid" >"$reply"

        # request failed (creating a new request may help)
        if grep -q "Status=FAILED" "$reply"; then
            touch "$dir/.failed"
            new_failed_rids+=("$rid")
        fi

        # request not ready
        if grep -q "Status=WAITING" "$reply"; then
            ((waiting_count++))
        fi

        # request report is ready
        if grep -q "Status=READY" "$reply"; then
            if grep -q "No significant similarity found." "$reply"; then
                # no match found for the sequence
                touch "$dir/.nomatch"
                new_unmatched_rids+=("$rid")
            else
                # download report in csv format, retrying a few times
                downloaded=0
                for ((attempt = 1; attempt <= max_attempts; attempt++)); do
                    if curl -sSL "https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Get&RID=$rid&RESULTS_FILE=on&FORMAT_TYPE=CSV&DESCRIPTIONS=10&FORMAT_OBJECT=Alignment&QUERY_INDEX=0&DOWNLOAD_TEMPL=Results&ALIGNMENT_VIEW=Pairwise&QUERY_INDEX=0&CONFIG_DESCR=2,3,6,7,8,9,10,11,12" >"$response_csv" 2>/dev/null; then
                        downloaded=1
                        break
                    fi
                    ((attempt < max_attempts)) && sleep 1
                done

                if ((downloaded)); then
                    # only a complete download ever reaches report.csv
                    cat "$response_csv" >"$dir/report.csv"
                    new_matched_rids+=("$rid")
                else
                    # no local report yet: still waiting, next sync retries
                    printf '\nWarning: could not download report for %s\n' "$rid" >&2
                    ((waiting_count++))
                fi
            fi
        fi
    done

    rm -f -- "$reply" "$response_csv"

    local old_count=$((old_matched_count + old_unmatched_count + old_failed_count))
    local new_count=$((${#new_matched_rids[@]} + ${#new_unmatched_rids[@]} + ${#new_failed_rids[@]}))

    echo -e "\rDatabase updated: $count queries found"
    echo   "----------- old --- new ---"
    printf "matched:    %3d     %3d\n"  "$old_matched_count"    "${#new_matched_rids[@]}"
    printf "unmatched:  %3d     %3d\n"  "$old_unmatched_count"  "${#new_unmatched_rids[@]}"
    printf "failed:     %3d     %3d\n"  "$old_failed_count"     "${#new_failed_rids[@]}"
    echo   "---------------------------"
    printf "ready:      %3d     %3d\n"  "$old_count"            "$new_count"
    printf "waiting:    %3d\n"          "$waiting_count"

    # success only when no request is left without a final outcome
    if ((old_count + new_count == count)); then
        exit 0
    else
        exit 1
    fi
}

# Print the IDs of all stored requests whose sequence file is identical to
# the given sequence.
# Arguments: $1 - file holding the sequence, or "-" for standard input
# Outputs:   one request ID per line on stdout (plus the input banners
#            when reading from standard input)
# Returns:   0 if at least one request matched, 1 otherwise (also when the
#            input cannot be read)
search_by_content() {
    local source_name=${1:-}
    local query candidate
    local found=0

    # searched sequence
    query=$(mktemp) || return 1
    if [[ $source_name == - ]]; then
        echo "---- sequence begin ----"
    fi
    if ! cat -- "$source_name" >"$query"; then
        # do not go on comparing an empty query against the database
        rm -f -- "$query"
        return 1
    fi
    if [[ $source_name == - ]]; then
        echo "----- sequence end -----"
    fi

    # compare with existing requests; NUL-delimited so that unusual
    # directory names are never split into several words
    while IFS= read -r -d '' candidate; do
        if cmp -s -- "$query" "$candidate"; then
            # the request ID is the name of the directory holding the file
            basename -- "$(dirname -- "$candidate")"
            found=1
        fi
    done < <(find blaster_requests -type f -name sequence.fasta -print0)

    rm -f -- "$query"

    # set exit status
    ((found))
}

# Show the state of the local database.
#   no arguments: print how many requests are waiting / ready
#   RID...:       print file locations and status of each given request
# Arguments: $@ - zero or more request IDs
# Outputs:   the summary table or the per-request details on stdout
# Returns:   0 on success, 1 when at least one given ID is not in the
#            database
show_details() {
    # database info
    if (($# == 0)); then
        local total ready
        total=$(find blaster_requests -mindepth 1 -maxdepth 1 -type d | wc -l)
        # a request is ready as soon as its directory holds a report.* file
        ready=$(find blaster_requests -mindepth 2 -maxdepth 2 -type f -name 'report.*' |
            cut -d/ -f2 | sort -u | wc -l)

        # every directory without a report counts as waiting
        printf 'waiting\tready\n%d\t%d\n' "$((total - ready))" "$((ready))" | column -t
        return
    fi

    # request info
    local rid dir
    local status=0
    local first=1
    for rid in "$@"; do
        # blank line between consecutive entries
        ((first)) || echo
        first=0

        dir=blaster_requests/$rid
        if [[ -z $rid || ! -d $dir ]]; then
            echo "Error: can't find a request with ID: $rid"
            status=1
            continue
        fi

        echo     "Sequence:     $PWD/$dir/sequence.fasta"

        # Exactly one status per request. It is derived from the marker
        # files written by sync rather than from the number of files in
        # the directory, which also holds the 'filename' bookkeeping file.
        if [[ -f $dir/report.csv ]]; then
            echo "Report CSV:   $PWD/$dir/report.csv"
            # the text report is optional: list it only when present
            if [[ -f $dir/report.txt ]]; then
                echo "Report TXT:   $PWD/$dir/report.txt"
            fi
            echo "Status:       ready"
        elif [[ -f $dir/.failed ]]; then
            echo "Status:       failed (try creating a new request)"
        elif [[ -f $dir/.nomatch ]]; then
            echo "Status:       unmatched"
        else
            echo "Status:       waiting"
        fi

        echo     "Origin:       https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Get&RID=$rid"
    done

    return "$status"
}

###### main ######
# Every command works on ./blaster_requests: make sure it is there first.
mkdir -p blaster_requests

# Dispatch on the first command-line word. For the commands that take an
# optional FILE, "${2--}" turns a missing argument into "-" (standard input).
case "$1" in
    create)
        create_request "${2--}"
        ;;
    delete)
        delete_request "$2"
        ;;
    help)
        print_help
        exit
        ;;
    info)
        # pass on every argument after the command name
        show_details "${@:2}"
        ;;
    search)
        search_by_content "${2--}"
        ;;
    sync)
        sync_reports
        ;;
    *)
        # unknown or missing command: show usage and signal the error
        print_help
        exit 1
        ;;
esac